Linear train
split_ratio <- 0.7
set.seed(168)
split_index <- createDataPartition(df_data$cases_new, p=split_ratio, list=FALSE)
data_train <- df_data[split_index,]
data_test <- df_data[-split_index,]
linear_model <- lm(cases_new~cases_active,data=data_train)
summary(linear_model)
##
## Call:
## lm(formula = cases_new ~ cases_active, data = data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3742.5 -215.4 -127.7 221.8 4264.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.286e+02 5.175e+01 2.485 0.0133 *
## cases_active 8.244e-02 6.365e-04 129.506 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 948.8 on 494 degrees of freedom
## Multiple R-squared: 0.9714, Adjusted R-squared: 0.9713
## F-statistic: 1.677e+04 on 1 and 494 DF, p-value: < 2.2e-16
# linear_model <- train(cases_new~date, data=data_train, method="lm", trControl=data_train_control)
plot(linear_model)




linear_prediction <- linear_model %>% predict(data_test)
linear_compare <- data.frame(actual=data_test$cases_new, predicted=linear_prediction)
head(linear_compare)
## actual predicted
## 1 4 128.9024
## 5 3 129.1497
## 8 0 129.2322
## 10 0 129.2322
## 14 1 129.7268
## 16 1 129.7268
data.frame(
R2 = R2(linear_prediction, data_test$cases_new),
RMSE = RMSE(linear_prediction, data_test$cases_new),
MAE = MAE(linear_prediction, data_test$cases_new)
)
## R2 RMSE MAE
## 1 0.9721543 906.3865 545.7654
# Chart init
df_predicted <- data.frame(date=data_test$date, cases_new=linear_prediction)
df_actual <- data_test
df_train <- data_train
lm_chart <- plot_ly()
# Predicted Data
lm_chart <- lm_chart %>%
add_trace(
x = df_predicted[["date"]], y = df_predicted[["cases_new"]],
name = "Predicted Data",
type = 'scatter',
mode = 'lines',
line = list(color = 'red', width = 3)
)
# Test Data
lm_chart <- lm_chart %>%
add_trace(
x = df_actual[["date"]], y = df_actual[["cases_new"]],
name = "Actual Data",
type = 'scatter',
mode = 'lines',
line = list(color = 'skyblue', width = 3)
)
lm_chart <- lm_chart %>%
add_trace(
x = df_train[["date"]], y = df_train[["cases_new"]],
name = "Train Data",
type = "scatter",
mode = "lines",
line = list(color = 'green', width = 2)
)
# Set figure title, x and y-axes titles
lm_chart <- lm_chart %>% layout(
title = "Model Comparison of Daily New Cases",
xaxis = list(title="Recorded Time"),
yaxis = list(title="Daily Count of New Cases")
)%>%
layout(plot_bgcolor='#e5ecf6',
xaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
yaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff')
)
lm_chart
Predict future new cases Using Predicted future active cases from ARIMA Model
init_year <- format(as.Date(df_data[1,1], format="%Y-%m-%d"),"%Y")
init_day <- yday(as.Date(df_data[1,1], format="%Y-%m-%d"))
data_arima <- ts(df_data$cases_active, start=c(init_year,init_day), frequency=365)
head(data_arima)
## Time Series:
## Start = c(2020, 25)
## End = c(2020, 30)
## Frequency = 365
## [1] 4 4 4 4 7 8
arima_model <- auto.arima(df_data$cases_active, trace = TRUE, ic = 'aicc', approximation = FALSE, stepwise = FALSE)
##
## ARIMA(0,1,0) : 12574.82
## ARIMA(0,1,0) with drift : 12576.06
## ARIMA(0,1,1) : 12201.06
## ARIMA(0,1,1) with drift : 12202.55
## ARIMA(0,1,2) : 12000.65
## ARIMA(0,1,2) with drift : 12002.3
## ARIMA(0,1,3) : 11949.82
## ARIMA(0,1,3) with drift : 11951.54
## ARIMA(0,1,4) : 11892.69
## ARIMA(0,1,4) with drift : 11894.47
## ARIMA(0,1,5) : 11833.57
## ARIMA(0,1,5) with drift : 11835.4
## ARIMA(1,1,0) : 11789.02
## ARIMA(1,1,0) with drift : 11790.96
## ARIMA(1,1,1) : 11637.72
## ARIMA(1,1,1) with drift : 11639.74
## ARIMA(1,1,2) : 11636.15
## ARIMA(1,1,2) with drift : 11638.18
## ARIMA(1,1,3) : 11637.46
## ARIMA(1,1,3) with drift : 11639.49
## ARIMA(1,1,4) : 11612.64
## ARIMA(1,1,4) with drift : 11614.67
## ARIMA(2,1,0) : 11710.42
## ARIMA(2,1,0) with drift : 11712.4
## ARIMA(2,1,1) : 11636.58
## ARIMA(2,1,1) with drift : 11638.61
## ARIMA(2,1,2) : 11638.07
## ARIMA(2,1,2) with drift : 11640.11
## ARIMA(2,1,3) : Inf
## ARIMA(2,1,3) with drift : 11640.24
## ARIMA(3,1,0) : 11684.41
## ARIMA(3,1,0) with drift : 11686.42
## ARIMA(3,1,1) : 11636.13
## ARIMA(3,1,1) with drift : 11638.16
## ARIMA(3,1,2) : Inf
## ARIMA(3,1,2) with drift : Inf
## ARIMA(4,1,0) : 11635.54
## ARIMA(4,1,0) with drift : 11637.56
## ARIMA(4,1,1) : 11623.02
## ARIMA(4,1,1) with drift : 11625.06
## ARIMA(5,1,0) : 11626.8
## ARIMA(5,1,0) with drift : 11628.84
##
##
##
## Best model: ARIMA(1,1,4)
## Series: df_data$cases_active
## ARIMA(1,1,4)
##
## Coefficients:
## ar1 ma1 ma2 ma3 ma4
## 0.9737 -0.6159 -0.1013 -0.0863 0.2176
## s.e. 0.0090 0.0383 0.0443 0.0453 0.0415
##
## sigma^2 estimated as 785818: log likelihood=-5800.26
## AIC=11612.52 AICc=11612.64 BIC=11639.89
forecast_length <- 30
arima_predict <- forecast(arima_model, forecast_length)
head(arima_predict)
## $method
## [1] "ARIMA(1,1,4)"
##
## $model
## Series: df_data$cases_active
## ARIMA(1,1,4)
##
## Coefficients:
## ar1 ma1 ma2 ma3 ma4
## 0.9737 -0.6159 -0.1013 -0.0863 0.2176
## s.e. 0.0090 0.0383 0.0443 0.0453 0.0415
##
## sigma^2 estimated as 785818: log likelihood=-5800.26
## AIC=11612.52 AICc=11612.64 BIC=11639.89
##
## $level
## [1] 80 95
##
## $mean
## Time Series:
## Start = 709
## End = 738
## Frequency = 1
## [1] 40614.16 40307.63 40048.57 39882.01 39719.83 39561.92 39408.15 39258.42
## [9] 39112.62 38970.65 38832.42 38697.81 38566.74 38439.11 38314.84 38193.83
## [17] 38076.00 37961.26 37849.54 37740.75 37634.82 37531.67 37431.23 37333.44
## [25] 37238.21 37145.48 37055.18 36967.26 36881.65 36798.29
##
## $lower
## Time Series:
## Start = 709
## End = 738
## Frequency = 1
## 80% 95%
## 709 39478.114 38876.7256
## 710 38391.848 37377.6936
## 711 37403.746 36003.6634
## 712 36566.876 34811.9487
## 713 35617.312 33445.5671
## 714 34581.553 31945.1080
## 715 33477.514 30338.0259
## 716 32317.611 28643.3705
## 717 31110.863 26874.9883
## 718 29864.124 25043.4190
## 719 28582.818 23157.0088
## 720 27271.371 21222.5802
## 721 25933.490 19245.8519
## 722 24572.339 17231.7126
## 723 23190.662 15184.4066
## 724 21790.866 13107.6638
## 725 20375.089 11004.7946
## 726 18945.235 8878.7600
## 727 17503.021 6732.2261
## 728 16049.997 4567.6059
## 729 14587.570 2387.0935
## 730 13117.025 192.6909
## 731 11639.533 -2013.7690
## 732 10156.171 -4230.6039
## 733 8667.926 -6456.2660
## 734 7175.708 -8689.3286
## 735 5680.356 -10928.4742
## 736 4182.644 -13172.4849
## 737 2683.288 -15420.2328
## 738 1182.949 -17670.6729
##
## $upper
## Time Series:
## Start = 709
## End = 738
## Frequency = 1
## 80% 95%
## 709 41750.21 42351.60
## 710 42223.42 43237.57
## 711 42693.38 44093.47
## 712 43197.15 44952.08
## 713 43822.36 45994.10
## 714 44542.28 47178.73
## 715 45338.78 48478.27
## 716 46199.22 49873.46
## 717 47114.38 51350.25
## 718 48077.18 52897.89
## 719 49082.01 54507.82
## 720 50124.25 56173.04
## 721 51199.99 57887.63
## 722 52305.88 59646.51
## 723 53439.01 61445.27
## 724 54596.79 63279.99
## 725 55776.90 65147.20
## 726 56977.28 67043.76
## 727 58196.05 68966.85
## 728 59431.50 70913.89
## 729 60682.07 72882.55
## 730 61946.32 74870.65
## 731 63222.94 76876.24
## 732 64510.70 78897.48
## 733 65808.48 80932.68
## 734 67115.25 82980.28
## 735 68430.01 85038.84
## 736 69751.88 87107.01
## 737 71080.02 89183.54
## 738 72413.63 91267.25
plot(arima_predict, main = "Predicted Active Cases", col.main = "black")

last_date <- as.Date(df_data[(nrow(df_data)):nrow(df_data),1], format="%Y-%m-%d")
last_date <- last_date + 1
df_arima <- data.frame(
date=seq(last_date, by = "day", length.out = forecast_length),
cases_active=arima_predict$mean
)
predict(linear_model, newdata=df_arima)
## 1 2 3 4 5 6 7 8
## 3476.616 3451.347 3429.991 3416.261 3402.892 3389.874 3377.197 3364.854
## 9 10 11 12 13 14 15 16
## 3352.836 3341.133 3329.737 3318.641 3307.836 3297.315 3287.070 3277.095
## 17 18 19 20 21 22 23 24
## 3267.381 3257.923 3248.713 3239.745 3231.013 3222.510 3214.230 3206.168
## 25 26 27 28 29 30
## 3198.318 3190.673 3183.230 3175.982 3168.925 3162.053
combined_prediction <- linear_model %>% predict(df_arima)
df_combined_predicted <- data.frame(date=df_arima$date, cases_new=combined_prediction)
df_data$month <- strftime(df_data$date, "%m")
df_data$year <- strftime(df_data$date, "%Y")
df_smooth <- df_data %>%
group_by(date=lubridate::floor_date(df_data$date, "month")) %>%
dplyr::summarize(cases_new = mean(cases_new)) %>%
data.frame
combined_chart <- plot_ly()
# Predicted Data
combined_chart <- combined_chart %>%
add_trace(
x = df_combined_predicted[["date"]], y = df_combined_predicted[["cases_new"]],
name = "Future Predicted Data",
type = 'scatter',
mode = 'lines',
line = list(color = 'red', width = 3)
)
# Test Data
combined_chart <- combined_chart %>%
add_trace(
x = df_smooth[["date"]], y = df_smooth[["cases_new"]],
name = "Actual Data (Rolled to Monthly)",
type = 'scatter',
mode = 'lines',
line = list(color = 'skyblue', width = 3)
)
# Set figure title, x and y-axes titles
combined_chart <- combined_chart %>% layout(
title = "Prediction of Daily New Cases",
xaxis = list(title="Recorded Time"),
yaxis = list(title="Daily Count of New Cases")
)%>%
layout(plot_bgcolor='#e5ecf6',
xaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
yaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff')
)
combined_chart
End of document